import numpy as np
import tensorflow.compat.v2 as tf
tf.enable_v2_behavior()
import pandas as pd
from tensorflow import keras
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from matplotlib import pyplot
import plotly.graph_objects as go
import math
import seaborn as sns
from sklearn.metrics import mean_squared_error
np.random.seed(1)
tf.random.set_seed(1)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, GRU, Dropout, RepeatVector, TimeDistributed
from keras import backend
# --- Model configuration and training hyperparameters ---
MODELFILENAME = 'MODELS/LSTM_1h_TFM'  # path prefix used when saving the trained model
TIME_STEPS=6 # sliding-window length: 6 samples == 1h of data
CMODEL = LSTM  # recurrent layer class to use (GRU is also imported and could be swapped in)
UNITS=40  # number of units in the recurrent layer
DROPOUT=0.405  # dropout rate applied after the recurrent layer
ACTIVATION='tanh'  # activation of the TimeDistributed output Dense layer
OPTIMIZER='adam'
EPOCHS=68  # upper bound; training may stop earlier via EarlyStopping
BATCHSIZE=45
VALIDATIONSPLIT=0.1  # fraction of the training data held out for validation
# Code to read csv file into Colaboratory:
# from google.colab import files
# uploaded = files.upload()
# import io
# df = pd.read_csv(io.BytesIO(uploaded['SentDATA.csv']))
# Dataset is now stored in a Pandas Dataframe
# Load the sensor dataset and index it by its 'Time' timestamp column.
df = pd.read_csv('../../data/dadesTFM.csv')
df.reset_index(inplace=True)
df['Time'] = pd.to_datetime(df['Time'])
df = df.set_index('Time')
# Particulate-matter columns after renaming (standard + ATM-corrected variants).
columns = ['PM1','PM25','PM10','PM1ATM','PM25ATM','PM10ATM']
df1 = df.copy();
# Normalize column names (drop the spaces used in the raw CSV headers).
df1 = df1.rename(columns={"PM 1":"PM1","PM 2.5":"PM25","PM 10":"PM10","PM 1 ATM":"PM1ATM","PM 2.5 ATM":"PM25ATM","PM 10 ATM":"PM10ATM"})
# Overwrite each renamed column with the original data cast to float32.
df1['PM1'] = df['PM 1'].astype(np.float32)
df1['PM25'] = df['PM 2.5'].astype(np.float32)
df1['PM10'] = df['PM 10'].astype(np.float32)
df1['PM1ATM'] = df['PM 1 ATM'].astype(np.float32)
df1['PM25ATM'] = df['PM 2.5 ATM'].astype(np.float32)
df1['PM10ATM'] = df['PM 10 ATM'].astype(np.float32)
df2 = df1.copy()
# 80/20 chronological train/test split (no shuffling: this is a time series).
train_size = int(len(df2) * 0.8)
test_size = len(df2) - train_size
train, test = df2.iloc[0:train_size], df2.iloc[train_size:len(df2)]
train.shape, test.shape  # notebook-style inspection; has no effect when run as a script
# notebook output: ((3117, 7), (780, 7))  -- shapes of train and test
#Standardize the data
# Fit a separate StandardScaler per pollutant column on the training split.
# NOTE(review): each fitted scaler is overwritten on the next iteration and not
# retained; the test split later re-fits its own scalers -- confirm intended.
for col in columns:
    scaler = StandardScaler()
    # .loc assignment avoids pandas' SettingWithCopyWarning (train is a slice of df2)
    train.loc[:, col] = scaler.fit_transform(train[[col]])
# notebook output removed: six repetitions of pandas SettingWithCopyWarning
# ("A value is trying to be set on a copy of a slice from a DataFrame. Try
# using .loc[row_indexer,col_indexer] = value instead"), one per scaled column.
def create_sequences(X, y, time_steps=TIME_STEPS):
    """Build sliding-window sequences from a feature frame and a target series.

    Returns a pair of numpy arrays (samples, targets): samples[i] contains the
    `time_steps` consecutive rows of X starting at i, and targets[i] is the
    value of y immediately following that window.
    """
    n_windows = len(X) - time_steps
    samples = [X.iloc[start:start + time_steps].values for start in range(n_windows)]
    targets = [y.iloc[start + time_steps] for start in range(n_windows)]
    return np.array(samples), np.array(targets)
# Build training sequences for a single column: PM2.5 (columns[1]).
X_train, y_train = create_sequences(train[[columns[1]]], train[columns[1]])
#X_test, y_test = create_sequences(test[[columns[1]]], test[columns[1]])
print(f'X_train shape: {X_train.shape}')
print(f'y_train shape: {y_train.shape}')
# notebook output: X_train shape: (3111, 6, 1)  y_train shape: (3111,)
# Add an extra metric: root-mean-squared error, usable in model.compile().
def rmse(y_true, y_pred):
    """Keras backend RMSE metric, reduced over the last axis."""
    squared_diff = backend.square(y_pred - y_true)
    return backend.sqrt(backend.mean(squared_diff, axis=-1))
# Sequence-to-sequence model: recurrent layer -> dropout -> per-timestep Dense(1).
model = Sequential()
model.add(CMODEL(units = UNITS, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(rate=DROPOUT))
# NOTE(review): TimeDistributed output is (batch, TIME_STEPS, 1) while y_train
# is (batch,); Keras broadcasts the scalar target across timesteps -- confirm
# this is the intended training target.
model.add(TimeDistributed(Dense(1,kernel_initializer='normal',activation=ACTIVATION)))
# MAE loss; MSE and the custom rmse metric are tracked alongside it.
model.compile(optimizer=OPTIMIZER, loss='mae',metrics=['mse',rmse])
model.summary()
# notebook output: model summary -- lstm (None, 6, 40) 6720 params; dropout
# (None, 6, 40); time_distributed (None, 6, 1) 41 params; total 6,761 trainable.
# Train with early stopping on validation loss; shuffle=False keeps time order.
history = model.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCHSIZE, validation_split=VALIDATIONSPLIT,
callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, mode='min')], shuffle=False)
# notebook output: training log, epochs 1-38 of 68 (EarlyStopping triggered).
# Final epoch 38: loss 0.3328, mse 0.3247, rmse 0.3595, val_loss 0.1335,
# val_mse 0.0342, val_rmse 0.1462.
import matplotlib.pyplot as plt
# Plot training vs. validation curves for every tracked metric.
plt.plot(history.history['loss'], label='MAE Training loss')
plt.plot(history.history['val_loss'], label='MAE Validation loss')
plt.plot(history.history['mse'], label='MSE Training loss')
plt.plot(history.history['val_mse'], label='MSE Validation loss')
plt.plot(history.history['rmse'], label='RMSE Training loss')
plt.plot(history.history['val_rmse'], label='RMSE Validation loss')
plt.legend();
# Per-window training reconstruction error (MAE over the time axis) and its
# distribution -- used to eyeball where an anomaly threshold could sit.
X_train_pred = model.predict(X_train, verbose=0)
train_mae_loss = np.mean(np.abs(X_train_pred - X_train), axis=1)
plt.hist(train_mae_loss, bins=50)
plt.xlabel('Train MAE loss')
plt.ylabel('Number of Samples');
def evaluate_prediction(predictions, actual, model_name):
    """Print MAE, RMSE and MSE between predictions and actual values.

    Returns the tuple (mae, rmse, mse) of floats, in that order.
    """
    diff = predictions - actual
    mean_sq = np.square(diff).mean()
    root_mean_sq = np.sqrt(mean_sq)
    mean_abs = np.abs(diff).mean()
    print(model_name + ':')
    print('Mean Absolute Error: {:.4f}'.format(mean_abs))
    print('Root Mean Square Error: {:.4f}'.format(root_mean_sq))
    print('Mean Square Error: {:.4f}'.format(mean_sq))
    print('')
    return mean_abs, root_mean_sq, mean_sq
# Report reconstruction error on the training set.
# Renamed the unpacked variables: the original `mae,rmse,mse = ...` rebound the
# name `rmse`, shadowing the rmse() metric function defined above.
train_mae, train_rmse, train_mse = evaluate_prediction(X_train_pred, X_train,"LSTM")
# notebook output: LSTM: MAE 0.1854, RMSE 0.4315, MSE 0.1862
# Persist the trained model in HDF5 format.
model.save(MODELFILENAME+'.h5')
# Compute the anomaly threshold from the test reconstruction error.
def calculate_threshold(X_test, X_test_pred):
    """Return the anomaly threshold for a set of reconstructions.

    Computes the per-sample RMSE between X_test_pred and X_test (mean over
    axis 1), sorts the distances, and returns the value at the
    90th-percentile index. (The original docstring claimed a 0.80 cutoff,
    but the code has always used 0.9; the documentation is fixed here.)
    """
    distance = np.sqrt(np.mean(np.square(X_test_pred - X_test), axis=1))
    # Sort the scores and pick the 0.9-fraction position as the threshold.
    distance.sort()
    cut_off = int(0.9 * len(distance))
    return distance[cut_off]
# Per-column evaluation and anomaly detection on the test split.
for col in columns:
    print ("####################### "+col +" ###########################")
    #Standardize the test data
    # NOTE(review): the scaler is re-fitted on the test split instead of reusing
    # the scaler fitted on train -- confirm intended (it uses test statistics).
    scaler = StandardScaler()
    # .loc assignment avoids pandas' SettingWithCopyWarning (test is a slice of df2).
    test.loc[:, col] = scaler.fit_transform(test[[col]])
    # Build windowed sequences for this column of the test data.
    X_test1, y_test1 = create_sequences(test[[col]], test[col])
    print(f'Testing shape: {X_test1.shape}')
    # Evaluate the model: returns [loss(mae), mse, rmse]. Renamed from `eval`,
    # which shadowed the builtin.
    eval_metrics = model.evaluate(X_test1, y_test1)
    print("evaluate: ",eval_metrics)
    # Model predictions for the test sequences.
    X_test1_pred = model.predict(X_test1, verbose=0)
    evaluate_prediction(X_test1_pred, X_test1,"LSTM")
    # Per-window reconstruction errors (MAE and RMSE over the time axis).
    test1_mae_loss = np.mean(np.abs(X_test1_pred - X_test1), axis=1)
    test1_rmse_loss = np.sqrt(np.mean(np.square(X_test1_pred - X_test1),axis=1))
    # reshaping test prediction
    X_test1_predReshape = X_test1_pred.reshape((X_test1_pred.shape[0] * X_test1_pred.shape[1]), X_test1_pred.shape[2])
    # reshaping test data
    X_test1Reshape = X_test1.reshape((X_test1.shape[0] * X_test1.shape[1]), X_test1.shape[2])
    threshold_test = calculate_threshold(X_test1Reshape,X_test1_predReshape)
    # Score dataframe: per-window RMSE loss vs. the 90th-percentile threshold.
    test1_score_df = pd.DataFrame(test[TIME_STEPS:])
    test1_score_df['loss'] = test1_rmse_loss.reshape((-1))
    test1_score_df['threshold'] = threshold_test
    test1_score_df['anomaly'] = test1_score_df['loss'] > test1_score_df['threshold']
    test1_score_df[col] = test[TIME_STEPS:][col]
    # Plot the test loss against the threshold.
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=test1_score_df.index, y=test1_score_df['loss'], name='Test loss'))
    fig.add_trace(go.Scatter(x=test1_score_df.index, y=test1_score_df['threshold'], name='Threshold'))
    fig.update_layout(showlegend=True, title='Test loss vs. Threshold')
    fig.show()
    # Collect the windows flagged as anomalies.
    anomalies1 = test1_score_df.loc[test1_score_df['anomaly'] == True]
    print('anomalies: ',anomalies1.shape)
    print()
    # Plot the inverse-transformed values and the detected anomalies to verify
    # that the normalization does not distort the data.
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=test1_score_df.index, y=scaler.inverse_transform(test1_score_df[col]), name=col))
    fig.add_trace(go.Scatter(x=anomalies1.index, y=scaler.inverse_transform(anomalies1[col]), mode='markers', name='Anomaly'))
    fig.update_layout(showlegend=True, title='Detected anomalies')
    fig.show()
    print ("######################################################")
# notebook output (per-column test evaluation), condensed:
#   PM1:     evaluate [loss 0.3818, mse 0.6058, rmse 0.4202]; anomalies (117, 10)
#   PM25:    evaluate [loss 0.3955, mse 0.5459, rmse 0.4342]; anomalies (95, 10)
#   PM10:    evaluate [loss 0.4055, mse 0.5153, rmse 0.4456]; anomalies (93, 10)
#   PM1ATM:  evaluate [loss 0.4064, mse 0.5437, rmse 0.4475]; anomalies (91, 10)
#   PM25ATM: evaluate [loss 0.4027, mse 0.5519, rmse 0.4434]; anomalies (92, 10)
#   PM10ATM: evaluate [loss 0.4035, mse 0.5317, rmse 0.4433]; anomalies (89, 10)
# Each iteration also emitted a pandas SettingWithCopyWarning for the
# chained test[col] assignment.